##################################################################
##################################################################
## Replication Material
## Widmann & Wich: Creating and Comparing Dictionary, Word Embedding, and Transformer-based 
## Models to Measure Discrete Emotions in German Political Text
## Political Analysis
## tobias.widmann@eui.eu
##
## Script 01: Replication of the Main Analysis
##################################################################
##################################################################

# Note: The file 000_readme.pdf describes all scripts and datasets required to replicate the analysis

# This script was run on the following R version, platform and OS:
# R version 4.0.5 (2021-03-31)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS Big Sur 11.5.1


sessionInfo()

#### Set Working Directory to the Replication Folder ############################

# Delete hashtag below and fill in the directory of the replication folder
#setwd("")

#### Load Packages ##############################################################

library(quanteda)   # Version 3.0.0
library(keras)      # Version 2.6.0
library(corpus)     # Version 0.10.1
library(ggplot2)    # Version 3.3.3
library(openxlsx)   # Version 4.2.3
library(stringr)    # Version 1.4.0
library(syuzhet)    # Version 1.0.6
library(readr)      # Version 1.4.0
library(stargazer)  # Version 5.2.2

#### DATA 1 #########################################################################

#### Load raw crowd-coded data ###################################################

load("./data1_raw.Rdata")


## Create Emotional Variables
# Build one count column per crowd-coding category (h_anger ... h_uncodable).
# A sentence (TextID) can occupy several rows, and each row carries up to four
# answers (Answer.1 ... Answer.4). For every category we count how often its
# German label occurs in those answers and store the total per TextID on every
# row that belongs to that TextID.

# Target column -> label as it is written in the answer columns.
# The order of this vector is the order in which the columns are appended.
crowd_labels <- c(
  h_anger      = "Ärger",
  h_fear       = "Angst",
  h_disgust    = "Ekel",
  h_sadness    = "Traurigkeit",
  h_joy        = "Freude",
  h_enthusiasm = "Enthusiasmus",
  h_pride      = "Stolz",
  h_hope       = "Hoffnung",
  h_none       = "Keine Emotion",
  h_uncodable  = "Nicht kodierbar"
)

# Answer columns that are actually present in the data.
answer_columns <- intersect(paste0("Answer.", 1:4), names(data1_raw))

# Unique sentence identifiers (not needed for the counting below; kept so the
# object is still available to any later code that expects it).
list1 <- unique(as.list(data1_raw$TextID))

for (target in names(crowd_labels)) {
  # Occurrences of the label in each row, added up over the answer columns.
  # A missing answer contributes 0, which is what sum(..., na.rm = TRUE) did.
  row_hits <- numeric(nrow(data1_raw))
  for (answer in answer_columns) {
    hits <- str_count(data1_raw[[answer]], crowd_labels[[target]])
    hits[is.na(hits)] <- 0
    row_hits <- row_hits + hits
  }

  # Total per TextID, written back to every row of that TextID.
  data1_raw[[target]] <- ave(row_hits, data1_raw$TextID, FUN = sum)
}



## Delete doubles
# The per-sentence totals now sit on every row of a sentence, so a single row
# per TextID is sufficient (the original note expects 10,000 sentences to
# remain). Drop two columns that are no longer needed, then keep the first row
# of each TextID.
data1_raw$Guru.ID <- NULL
data1_raw$UnitID <- NULL

first_occurrence <- !duplicated(data1_raw$TextID)
data1_raw <- data1_raw[first_occurrence, ]


## Add source variable
# source_info.Rdata is expected to provide the object `source_info`, which
# holds the source of each sentence (Facebook or Parliamentary Speech).
load("./source_info.Rdata")

# cbind() pairs rows purely by position; no identifier is compared here.
data1_raw <- cbind(data1_raw, source_info)

# Remove the columns named `ID` and `text` after the merge.
data1_raw$ID <- NULL
data1_raw$text <- NULL

#### Apply tools #########################################################################
# As a next step, we apply the different dictionaries and machine learning classifiers

##### ed8 ######

### This script applies the ed8 dictionary to a data frame with a text column
### The code in this script is based on Christian Rauh's "augmented dictionary"
### Rauh, C. (2018). Validating a sentiment dictionary for German political language. A workbench note. Journal of Information Technology & Politics, 0(0), 1–25. https://doi.org/10.1080/19331681. 2018.1485608

# Preprocessing
# Build the normalised sentence text step by step in a local vector and store
# it in `sent.text` once all steps are done.
clean_text <- tolower(data1_raw$Text)              # lower case
clean_text <- str_trim(clean_text, side = "both")  # trim both ends
# Remove URL links. The trailing "(.*)" is greedy, so everything from the URL
# to the end of the string is removed as well.
clean_text <- gsub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", clean_text, fixed = FALSE)
clean_text <- paste0(" ", clean_text, " ")         # pad with one space per side
clean_text <- gsub("[[:punct:]]", "", clean_text, fixed = FALSE) # drop punctuation
# Single pass: every pair of spaces becomes one space. Runs of three or more
# spaces are shortened but not fully collapsed.
clean_text <- gsub("  ", " ", clean_text, fixed = TRUE)
data1_raw$sent.text <- clean_text

### Count length of text ###

# Number of terms = number of spaces minus one (the leading pad space).
raw_term_count <- str_count(data1_raw$sent.text, " ") - 1
data1_raw$terms <- raw_term_count

# Removing stopwords from length count
# Using the German Stopword list supplied with the Snowball Stemmer
stopword_lines <- readLines("./SnowballStopwordsGerman.txt")
# Strip everything from the first "|" onwards (comments in the file).
# NOTE(review): whitespace in front of a "|" comment is not removed and would
# stay part of the term - confirm the supplied file has none.
stopword_lines <- sub("\\|.*", "", stopword_lines, fixed = FALSE)
# Keep only lines that still contain a lower-case letter, i.e. actual terms.
stopword_terms <- stopword_lines[grepl("[a-z]", stopword_lines)]
# One regular expression of the form " term1 | term2 | ... | dass ".
stopwords <- paste0(" ", paste(stopword_terms, collapse = " | "), " ", "| dass ")
stopwords

# Stopword matches per sentence, the untouched raw count, and the corrected count.
data1_raw$stopwords <- str_count(data1_raw$sent.text, stopwords)
data1_raw$terms.raw <- raw_term_count
data1_raw$terms <- raw_term_count - data1_raw$stopwords

# A corrected count of 0 is set to 1 so the normalisation further down never
# divides by zero.
zero_terms <- which(data1_raw$terms == 0)
data1_raw$terms[zero_terms] <- 1

### EMODIC ###

# The two files are expected to provide `negative_ed8` (columns `pattern` and
# `replacement`, used below) and the dictionary table `ed8`.
load("./negative_ed8.Rdata")
load("./ed8.Rdata")

# Work on a copy so that `sent.text` keeps the text without negation handling.
data1_raw$sent.text2 <- data1_raw$sent.text

# Apply every negation pattern in turn (regular expressions, in table order).
# seq_len() makes the loop a no-op for an empty table; 1:nrow() would have
# iterated over c(1, 0) in that case.
for (i in seq_len(nrow(negative_ed8))) {
  data1_raw$sent.text2 <- gsub(
    negative_ed8$pattern[i],
    negative_ed8$replacement[i],
    data1_raw$sent.text2,
    fixed = FALSE
  )
}

# Scoring
# For each of the eight ed8 emotions: take the dictionary entries flagged with
# 1 in that emotion's column of `ed8`, count how often each entry (used as a
# regular expression) occurs in `sent.text2`, and add the counts up per
# sentence. The result is stored in a column named after the emotion.
ed8_emotions <- c("anger", "fear", "disgust", "sadness",
                  "joy", "enthusiasm", "pride", "hope")

for (emotion in ed8_emotions) {
  emotion_terms <- as.character(ed8$feature[ed8[[emotion]] == 1])

  # Start from zero; an emotion without any dictionary entry scores 0 for every
  # sentence (the 1:nrow() loops would have produced NA in that case).
  emotion_score <- numeric(nrow(data1_raw))
  for (term in emotion_terms) {
    emotion_score <- emotion_score + str_count(data1_raw$sent.text2, term)
  }
  data1_raw[[emotion]] <- emotion_score
}


# Normalized emotional score
# Raw count divided by the stopword-corrected number of terms in the sentence.
for (emotion in ed8_emotions) {
  data1_raw[[paste0(emotion, ".norm")]] <- data1_raw[[emotion]] / data1_raw$terms
}

# Remove temporary objects
rm(emotion_terms, emotion_score)


##### NRC Dictionary #############
# Score the sentences with the German NRC dictionary that ships with the
# syuzhet package, then normalise by sentence length.

# Apply: one row of emotion counts per sentence, kept in a separate object.
nrc_scores <- get_nrc_sentiment(data1_raw$sent.text2, language = "german")

# Normalize: only these five NRC emotions are carried over into data1_raw.
for (emotion in c("anger", "fear", "disgust", "sadness", "joy")) {
  norm_column <- paste0("nrc.", emotion, ".norm")
  data1_raw[[norm_column]] <- nrc_scores[[emotion]] / data1_raw$terms
}


##### LIWC Dictionary ###########
# The LIWC dictionaries cannot be distributed with the replication material
# for copyright reasons. Without them this step can be skipped: the following
# scripts start from the prepared files data1_prepared.Rdata and
# data2_prepared.Rdata.


# Step 1: export the data frame as csv for the LIWC stand-alone application.
write.csv(data1_raw,'./liwc.csv')

# Step 2 (outside R): in the LIWC application choose the Internal German
# Dictionary 2015, de-select every category except "Affect" under
# "Category Options", run the analysis and save the output in the working
# directory as "liwc_results.csv".

# Step 3: read the LIWC output back into R.
liwc_results <- read.csv('./liwc_results.csv', header = TRUE)

# Drop the first row (it holds the former column names) and keep only columns
# 46 to 51, the ones created by the LIWC application.
liwc_results <- liwc_results[-1, 46:51]

# Name the three columns that are used further on.
colnames(liwc_results)[4:6] <- c("liwc.fear", "liwc.anger", "liwc.sad")

liwc_columns <- c("liwc.anger", "liwc.fear", "liwc.sad")

# The values arrive as text with a decimal comma; convert them to numeric.
for (liwc_column in liwc_columns) {
  liwc_results[[liwc_column]] <- as.numeric(gsub(",", ".", liwc_results[[liwc_column]]))
}

# Min-max scaling to the range 0 to 1. min() and max() are called without
# na.rm, so a single NA turns the whole scaled column into NA.
for (liwc_column in liwc_columns) {
  values <- liwc_results[[liwc_column]]
  liwc_results[[paste0(liwc_column, ".norm")]] <-
    (values - min(values)) / (max(values) - min(values))
}

# Attach the LIWC columns to the main data frame (rows are paired by position).
data1_raw <- cbind(data1_raw, liwc_results)


##### Delete uncodables #######################################################
# Keep only sentences with an "uncodable" count below 2, i.e. drop sentences
# that more than one crowd-coder marked as uncodable.
codable <- data1_raw$h_uncodable < 2
data1_raw <- data1_raw[codable, ]


##### Dummy variables ########################################################
# Binary indicators (1 if the underlying score is greater than 0, else 0):
#   hf_...  human coding
#   df_...  ed8 dictionary
#   nf_...  NRC dictionary
#   lf_...  LIWC dictionary
# Note: the word-embedding and transformer-based results are already binary
# and need no transformation.

# Indicator column -> score column it is derived from. The order of this
# vector is the order in which the indicator columns are appended.
dummy_sources <- c(
  hf_anger      = "h_anger",
  df_anger      = "anger.norm",
  nf_anger      = "nrc.anger.norm",
  lf_anger      = "liwc.anger.norm",
  hf_fear       = "h_fear",
  df_fear       = "fear.norm",
  nf_fear       = "nrc.fear.norm",
  lf_fear       = "liwc.fear.norm",
  hf_disgust    = "h_disgust",
  df_disgust    = "disgust",
  nf_disgust    = "nrc.disgust.norm",
  hf_sadness    = "h_sadness",
  df_sadness    = "sadness",
  lf_sadness    = "liwc.sad.norm",
  nf_sadness    = "nrc.sadness.norm",
  hf_joy        = "h_joy",
  df_joy        = "joy",
  nf_joy        = "nrc.joy.norm",
  hf_enthusiasm = "h_enthusiasm",
  df_enthusiasm = "enthusiasm",
  hf_pride      = "h_pride",
  df_pride      = "pride",
  hf_hope       = "h_hope",
  df_hope       = "hope"
)

for (dummy in names(dummy_sources)) {
  score <- data1_raw[[dummy_sources[[dummy]]]]
  # Start at 0 and switch to 1 where the score is positive. Rows with a
  # missing score, or a score column that does not exist, stay at 0.
  flag <- rep(0, nrow(data1_raw))
  flag[score > 0] <- 1
  data1_raw[[dummy]] <- flag
}


# Store the finished data set under its final name.
data1_prepared <- data1_raw

save(data1_prepared, file = "./data1_prepared.Rdata")
##### Create Training and Test Data #############################################
# Reproducible random split of data1_prepared: 90% of the rows (rounded down)
# form the training data, the remaining rows the test data.
set.seed(1111)
n_sentences <- nrow(data1_prepared)
sample_size <- floor(.90 * n_sentences)
train_ind <- sample(n_sentences, size = sample_size)

training_data <- data1_prepared[train_ind, ]
test_data <- data1_prepared[-train_ind, ]




##### Word Embeddings #########################################################
# The neural-network classifiers take one 100-dimensional vector per sentence.
# This section builds that vector for every test_data sentence by averaging
# the word embeddings of the (stemmed) words the sentence contains.

cgcorpus <- corpus(test_data$Text)

# Document-feature matrix: lower-cased, German stopwords removed.
# Note: `stopwords(...)` is a function call; the character object of the same
# name created earlier in this script does not interfere, because R skips
# non-function objects when it resolves a call.
# NOTE(review): passing a corpus and `remove =` to dfm() is deprecated in
# quanteda 3 - confirm before running on a newer quanteda.
cgdfm <- dfm(cgcorpus, remove=stopwords("german"), verbose=TRUE, tolower = TRUE)

# Stemming
cgdfm <- dfm_wordstem(cgdfm, language = "german")

# Load the locally trained word embeddings: first line skipped, then one word
# followed by 100 values per line.
# Note: the code for training the word embeddings can be found in the file 04_training_embeddings.R
w2v <- readr::read_delim("./vec_ed_preprocessed.txt",
                         skip=1, delim=" ", quote="",
                         col_names=c("word", paste0("V", 1:100)))

# Stem the terms included in the embeddings to increase matches
# NOTE(review): text_tokens() is a tokeniser - confirm every entry yields
# exactly one stem, so the %in% comparisons below compare like with like.
w2v$word <- text_tokens(w2v$word, stemmer = "de")

# Keep only embeddings of words that occur in the document-feature matrix.
w2v <- w2v[w2v$word %in% featnames(cgdfm),]


# Sentence-level embedding matrix: one row per document, 100 columns.
# Note: this code is based on Barbera's tutorial
n_docs <- ndoc(cgdfm)
dfm_features <- featnames(cgdfm)  # does not change inside the loop
embed <- matrix(NA, nrow=n_docs, ncol=100)
for (i in seq_len(n_docs)) {      # seq_len(): no iteration for zero documents
  if (i %% 100 == 0) message(i, '/', n_docs)
  # word counts of document i
  vec <- as.numeric(cgdfm[i,])
  # words that occur at least once
  doc_words <- dfm_features[vec>0]
  # embeddings of those words (columns 2 to 101 hold the 100 dimensions)
  embed_vec <- w2v[w2v$word %in% doc_words, 2:101]
  # document embedding = column-wise average; a document without any known
  # word gets an all-zero vector
  embed[i,] <- if (nrow(embed_vec)==0) 0 else colMeans(embed_vec, na.rm=TRUE)
}

# With the sentence embeddings in `embed`, load the stored classifier of each
# emotion, predict a class for every test sentence and append the prediction
# to test_data as a column named wb.<emotion>.
# Note: the code for training of the models can be found in the file 03_training_models.R
# NOTE(review): predict_classes() is reported as removed for TensorFlow >= 2.6 -
# confirm it is available with the installed backend.

# Anger
model <- load_model_hdf5("./keras_anger90", custom_objects = NULL, compile = TRUE)
wb.anger <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.anger)

# Fear
model <- load_model_hdf5("./keras_fear90", custom_objects = NULL, compile = TRUE)
wb.fear <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.fear)

# Disgust
model <- load_model_hdf5("./keras_disgust90", custom_objects = NULL, compile = TRUE)
wb.disgust <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.disgust)

# Sadness
model <- load_model_hdf5("./keras_sadness90", custom_objects = NULL, compile = TRUE)
wb.sadness <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.sadness)

# Joy
model <- load_model_hdf5("./keras_joy90", custom_objects = NULL, compile = TRUE)
wb.joy <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.joy)

# Enthusiasm
model <- load_model_hdf5("./keras_enthusiasm90", custom_objects = NULL, compile = TRUE)
wb.enthusiasm <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.enthusiasm)

# Pride
model <- load_model_hdf5("./keras_pride90", custom_objects = NULL, compile = TRUE)
wb.pride <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.pride)

# Hope
model <- load_model_hdf5("./keras_hope90", custom_objects = NULL, compile = TRUE)
wb.hope <- predict_classes(model, embed)
test_data <- cbind(test_data, wb.hope)




##### ELECTRA #################
# The transformer-based Electra model is applied in Python, because many
# state-of-the-art NLP models are only available there. See the notebook
# 05_apply_transformer_based_model.ipynb for that part.

# Step 1: export the test data as csv for the Python notebook.
write.csv(test_data,'./electra/electra.csv')

# Step 2 (outside R): run 05_apply_transformer_based_model.ipynb on that file.

# Step 3: read the notebook's result file back into R.
electra_results <- read.csv('./electra/electra_results.csv', header = TRUE)

# Columns 3 to 10 hold the eight emotion predictions; give them el_ names.
electra_emotions <- c("anger", "fear", "disgust", "sadness",
                      "joy", "enthusiasm", "pride", "hope")
colnames(electra_results)[3:10] <- paste0("el_", electra_emotions)

# Drop the columns that are not needed in the main data frame.
electra_results$text <- NULL
electra_results$X <- NULL

# Attach the predictions (rows are paired by position) and store the result.
test_data <- cbind(test_data, electra_results)

save(test_data, file = "./test_data.Rdata")

#### DATA 2 #########################################################################
# The same steps are now repeated for dataset 2, which is based on randomly
# sampled sentences.

load("./data2_raw.Rdata")

# Keep only rows without a value in Golden.Answer, then drop that column and
# the Type column.
no_golden_answer <- is.na(data2_raw$Golden.Answer)
data2_raw <- data2_raw[no_golden_answer, ]
data2_raw$Type <- NULL
data2_raw$Golden.Answer <- NULL


#### Emotional Variables #########
# Build one count column per crowd-coding category (h_anger ... h_uncodable).
# A sentence (TextID) can occupy several rows, and each row carries up to four
# answers (Answer.1 ... Answer.4). For every category we count how often its
# German label occurs in those answers and store the total per TextID on every
# row that belongs to that TextID.

# Target column -> label as it is written in the answer columns.
# The order of this vector is the order in which the columns are appended.
crowd_labels <- c(
  h_anger      = "Ärger",
  h_fear       = "Angst",
  h_disgust    = "Ekel",
  h_sadness    = "Traurigkeit",
  h_joy        = "Freude",
  h_enthusiasm = "Enthusiasmus",
  h_pride      = "Stolz",
  h_hope       = "Hoffnung",
  h_none       = "Keine Emotion",
  h_uncodable  = "Nicht kodierbar"
)

# Answer columns that are actually present in the data.
answer_columns <- intersect(paste0("Answer.", 1:4), names(data2_raw))

# Unique sentence identifiers (not needed for the counting below; kept so the
# object is still available to any later code that expects it).
list1 <- unique(as.list(data2_raw$TextID))

for (target in names(crowd_labels)) {
  # Occurrences of the label in each row, added up over the answer columns.
  # A missing answer contributes 0, which is what sum(..., na.rm = TRUE) did.
  row_hits <- numeric(nrow(data2_raw))
  for (answer in answer_columns) {
    hits <- str_count(data2_raw[[answer]], crowd_labels[[target]])
    hits[is.na(hits)] <- 0
    row_hits <- row_hits + hits
  }

  # Total per TextID, written back to every row of that TextID.
  data2_raw[[target]] <- ave(row_hits, data2_raw$TextID, FUN = sum)
}


## Delete doubles
# The per-sentence totals now sit on every row of a sentence, so a single row
# per TextID is sufficient (the original note expects 10,000 sentences to
# remain). Drop two columns that are no longer needed, then keep the first row
# of each TextID.
data2_raw$Guru.ID <- NULL
data2_raw$UnitID <- NULL

first_occurrence <- !duplicated(data2_raw$TextID)
data2_raw <- data2_raw[first_occurrence, ]






#### Apply tools #########################################################################
# Next, the dictionaries and machine learning classifiers are applied to dataset 2.

##### ed8 ###########

# Preprocessing
# Build the normalised sentence text step by step in a local vector and store
# it in `sent.text` once all steps are done.
clean_text <- tolower(data2_raw$Text)              # lower case
clean_text <- str_trim(clean_text, side = "both")  # trim both ends
# Remove URL links. The trailing "(.*)" is greedy, so everything from the URL
# to the end of the string is removed as well.
clean_text <- gsub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", clean_text, fixed = FALSE)
clean_text <- paste0(" ", clean_text, " ")         # pad with one space per side
clean_text <- gsub("[[:punct:]]", "", clean_text, fixed = FALSE) # drop punctuation
# Single pass: every pair of spaces becomes one space. Runs of three or more
# spaces are shortened but not fully collapsed.
clean_text <- gsub("  ", " ", clean_text, fixed = TRUE)
data2_raw$sent.text <- clean_text

# Second copy of the cleaned text; this is the one the tools below read.
data2_raw$sent.text2 <- clean_text


# Load the ed8 dictionary from its YAML file. From here on `ed8` is the object
# returned by dictionary().
ed8 <- dictionary(
  file = "./ed8.yml",
  format = "YAML"
)

# Create the function
# get_ed8_emotions(): apply an emotion dictionary to a data frame of texts.
#
# Arguments:
#   data  data frame with the texts in a column called "text"
#   dict  dictionary handed to dfm_lookup(); defaults to the object `ed8`,
#         which is looked up when the function is called
#
# Returns `data` with these columns appended: the columns produced by
# convert(..., to = 'data.frame') on the lookup result, and `terms`, the number
# of tokens per document counted before negation compounding and stopword
# removal.
get_ed8_emotions <- function(data, dict = ed8){
  # corpus() reads the "text" column of a data frame; fail with a clear message
  # if it is missing.
  if (is.data.frame(data) && !("text" %in% names(data))) {
    stop("`data` needs a column called \"text\"", call. = FALSE)
  }

  #Create a corpus from the data frame
  corp <- corpus(data)

  #Tokenize corpus and pre-process (remove punctuation, numbers, and urls)
  toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)

  #DFM of all tokens, used only to measure document length
  terms_dfm <- dfm(toks)

  #Negation control: join each negation word with the token that follows it.
  #The words are processed one after another, in this order.
  negation_words <- c("nicht", "nichts", "kein", "keine", "keinen")
  toks_neg_bigram <- toks
  for (negation in negation_words) {
    toks_neg_bigram <- tokens_compound(toks_neg_bigram,
                                       pattern = phrase(paste(negation, "*")))
  }

  #Turn tokens into DFM, then remove stopwords. dfm_remove() replaces the
  #`remove =` argument of dfm(), which is deprecated since quanteda 3.
  emo_dfm <- dfm_remove(dfm(toks_neg_bigram), pattern = stopwords("de"))

  #Apply dictionary
  dict_dfm_results <- dfm_lookup(emo_dfm, dict)

  #Convert results back to data frame and attach them to the input
  results_df <- cbind(data, convert(dict_dfm_results, to = 'data.frame'))

  #Assign length to each document
  results_df$terms <- ntoken(terms_dfm)

  results_df
}

# Apply the function: it expects the texts in a column called "text", so the
# cleaned sentences are copied there first.
data2_raw$text <- data2_raw$sent.text2
data2_raw <- get_ed8_emotions(data2_raw)

# Normalised emotional scores: dictionary count (columns ed8.ANGER ... ed8.HOPE)
# divided by document length.
# NOTE(review): unlike the dataset 1 branch, `terms` is not raised from 0 to 1
# here, so a document without tokens would yield NaN - confirm none exist.
for (emotion in c("anger", "fear", "disgust", "sadness",
                  "joy", "enthusiasm", "pride", "hope")) {
  count_column <- paste0("ed8.", toupper(emotion))
  data2_raw[[paste0(emotion, ".norm")]] <- data2_raw[[count_column]] / data2_raw$terms
}


##### NRC Dictionary #############
# Score the sentences with the German NRC dictionary that ships with the
# syuzhet package, then normalise by document length.

# One row of emotion counts per sentence, kept in a separate object.
nrc_scores <- get_nrc_sentiment(data2_raw$sent.text2, language = "german")

# Only these five NRC emotions are carried over into data2_raw.
for (emotion in c("anger", "fear", "disgust", "sadness", "joy")) {
  norm_column <- paste0("nrc.", emotion, ".norm")
  data2_raw[[norm_column]] <- nrc_scores[[emotion]] / data2_raw$terms
}

##### Word Embeddings #########################################################
# To apply the 'simple' neural networks classifiers based on word embeddings, 
# we firstly create a corpus from our 10,000 sentences

# To apply the 'simple' neural networks classifiers based on word embeddings, we firstly create
# a corpus from our 10,000 sentences
cgcorpus <- corpus(data2_raw$Text)

# Create a document feature matrix and conduct pre-processing
cgdfm <- dfm(cgcorpus, remove=stopwords("german"), verbose=TRUE, tolower = TRUE)

# Stemming
cgdfm <- dfm_wordstem(cgdfm, language = "german")

# Next, the word embeddings are read into a data frame so that the features of
# each document can be matched with their corresponding embeddings.

# Load the locally trained word embeddings into R: the first line of the file is
# skipped, column 1 is the word and columns V1..V100 hold the embedding values
# Note: the code for training the word embeddings can be found in the file 04_training_embeddings.R
w2v <- readr::read_delim(
  file = "./vec_ed_preprocessed.txt",
  delim = " ",
  quote = "",
  skip = 1,
  col_names = c("word", sprintf("V%d", 1:100))
)

# Stem the embedding vocabulary
w2v$word <- text_tokens(w2v$word, stemmer = "de")

# Keep only the embeddings whose (stemmed) word occurs as a feature in the corpus
in_corpus <- w2v$word %in% featnames(cgdfm)
w2v <- w2v[in_corpus, ]


# Create the document-level embedding matrix: one row per document in cgdfm and
# one column per embedding dimension (100). Each row is the average of the
# embeddings of the words that occur in the document.
n_docs <- ndoc(cgdfm)
feature_names <- featnames(cgdfm)  # does not change inside the loop, so computed once
embed2 <- matrix(NA, nrow = n_docs, ncol = 100)
# seq_len() gives an empty sequence for an empty dfm; 1:n_docs would iterate over c(1, 0)
for (i in seq_len(n_docs)) {
  # progress message every 100 documents
  if (i %% 100 == 0) message(i, '/', n_docs)
  # extract word counts
  vec <- as.numeric(cgdfm[i, ])
  # keep words with counts of 1 or more
  doc_words <- feature_names[vec > 0]
  # extract embeddings for those words (columns 2:101 hold the embedding values)
  embed2_vec <- w2v[w2v$word %in% doc_words, 2:101]
  if (nrow(embed2_vec) == 0) {
    # if none of the document's words has an embedding, simply set the row to 0
    embed2[i, ] <- 0
  } else {
    # aggregate from word- to document-level embeddings by taking the average
    embed2[i, ] <- colMeans(embed2_vec, na.rm = TRUE)
  }
}

# With the sentence embeddings in place, the trained machine learning model of
# each emotion is loaded and used to classify the sentences.
# Note: the code for training of the models can be found in the file 03_training_models.R
# NOTE(review): predict_classes() has been deprecated in newer keras releases --
# confirm that it is available in the installed version.

model <- load_model_hdf5("./keras_anger90", custom_objects = NULL, compile = TRUE)
wb.anger <- predict_classes(model, embed2)

model <- load_model_hdf5("./keras_fear90", custom_objects = NULL, compile = TRUE)
wb.fear <- predict_classes(model, embed2)

model <- load_model_hdf5("./keras_disgust90", custom_objects = NULL, compile = TRUE)
wb.disgust <- predict_classes(model, embed2)

model <- load_model_hdf5("./keras_sadness90", custom_objects = NULL, compile = TRUE)
wb.sadness <- predict_classes(model, embed2)

model <- load_model_hdf5("./keras_joy90", custom_objects = NULL, compile = TRUE)
wb.joy <- predict_classes(model, embed2)

model <- load_model_hdf5("./keras_enthusiasm90", custom_objects = NULL, compile = TRUE)
wb.enthusiasm <- predict_classes(model, embed2)

model <- load_model_hdf5("./keras_pride90", custom_objects = NULL, compile = TRUE)
wb.pride <- predict_classes(model, embed2)

model <- load_model_hdf5("./keras_hope90", custom_objects = NULL, compile = TRUE)
wb.hope <- predict_classes(model, embed2)

# Append all eight predictions to the main data frame; the new columns are
# named after the objects (wb.anger, wb.fear, ...)
data2_raw <- cbind(
  data2_raw,
  wb.anger, wb.fear, wb.disgust, wb.sadness,
  wb.joy, wb.enthusiasm, wb.pride, wb.hope
)

##### LIWC Dictionary ###########
# Note that the LIWC dictionaries cannot be added to the Dataverse
# because of copyright reasons. If you want to reproduce the findings of the paper
# or appendix and do not have the LIWC dictionaries, you can skip this step and continue
# with the following scripts.
# The following scripts rely on the already prepared data files called
# data1_prepared.Rdata and data2_prepared.Rdata


# To apply the LIWC dictionary, first export the R data frame to a csv file that
# can be used by the LIWC application
write.csv(data2_raw, file = './liwc2.csv')

# In the LIWC stand-alone application you need to choose the Internal German Dictionary 2015
# Under "Category Options", de-select all categories except the "Affect" category
# Run the analysis and then save the output file in the same directory with the name "liwc_results2.csv"

# After using the LIWC application, read the results file back into the R environment
liwc_results2 <- read.csv('./liwc_results2.csv', header = TRUE)

# Drop the first row (it contains the former column names) and keep only the
# last columns (52 to 57), which were created by the LIWC app
liwc_results2 <- liwc_results2[-1, 52:57]

# Rename the columns holding the fear, anger and sadness scores
colnames(liwc_results2)[4:6] <- c("liwc.fear", "liwc.anger", "liwc.sad")

for (liwc_col in c("liwc.anger", "liwc.fear", "liwc.sad")) {
  # Turn the column into numeric, because R read it as character
  # (decimal commas are replaced by decimal points first)
  scores <- as.numeric(gsub(",", ".", liwc_results2[[liwc_col]]))
  liwc_results2[[liwc_col]] <- scores

  # Normalize the LIWC variable to bring it between 0 and 1 (min-max scaling)
  bounds <- range(scores)
  liwc_results2[[paste0(liwc_col, ".norm")]] <-
    (scores - bounds[1]) / (bounds[2] - bounds[1])
}

# Add them to the main dataframe
data2_raw <- cbind(data2_raw, liwc_results2)




##### ELECTRA #################
# Note that the code to apply the transformer-based Electra model is only available
# in Python, since many state-of-the-art NLP models are more easily available in a Python environment
# Please follow the code in file 05_apply_transformer_based_model.ipynb to apply the model in Python
# This step can be skipped: the files data1_prepared.Rdata and data2_prepared.Rdata
# used in the following scripts include the classification of the Electra model

# To apply the Electra model, first export the R data frame to a csv file that
# can be used in the Python script
write.csv(data2_raw, file = './electra/electra2.csv')

# Then read the csv file into Python, following the code in the Python script 05_apply_transformer_based_model.ipynb

# After using the Python script, read the results file back into the R environment
electra_results2 <- read.csv('./electra/electra_results2.csv', header = TRUE)

# Rename columns 3 to 10 before binding the results to the main dataframe
colnames(electra_results2)[3:10] <- paste0(
  "el_",
  c("anger", "fear", "disgust", "sadness", "joy", "enthusiasm", "pride", "hope")
)

# Delete the unnecessary columns
electra_results2$text <- NULL
electra_results2$X <- NULL

# Add results to the main dataframe
data2_raw <- cbind(data2_raw, electra_results2)




#### Delete uncodables #######################################################
# Drop sentences that were coded as "uncodable" by two or more crowd-coders
data2_raw <- data2_raw[!(data2_raw$h_uncodable >= 2), ]



## Add source variable
# The source of each sentence (Facebook or Parliamentary Speech) is loaded and
# joined to the main data frame via the TextID column
load("./source_info2.Rdata")

data2_raw <- merge(x = data2_raw, y = source_info2, by = "TextID")

#### Dummy variables ########################################################
# For every emotion and tool, create a 0/1 indicator that is 1 when the
# underlying score is greater than zero:
#   hf_ = human coders (h_*)            df_ = ed8 dictionary (*.norm)
#   nf_ = NRC dictionary (nrc.*.norm)   lf_ = LIWC dictionary (liwc.*.norm)
#
# Each entry maps the new indicator (name) to its source column (value); the
# order of the entries is the order in which the columns are added.
# Source columns are given by their full names and looked up with `[[`, which
# matches exactly. Abbreviated names such as `$disgust` only resolve through the
# partial matching of `$` and silently yield NULL once the prefix is ambiguous.
dummy_sources <- c(
  hf_anger      = "h_anger",
  df_anger      = "anger.norm",
  nf_anger      = "nrc.anger.norm",
  lf_anger      = "liwc.anger.norm",

  hf_fear       = "h_fear",
  df_fear       = "fear.norm",
  nf_fear       = "nrc.fear.norm",
  lf_fear       = "liwc.fear.norm",

  hf_disgust    = "h_disgust",
  df_disgust    = "disgust.norm",
  nf_disgust    = "nrc.disgust.norm",

  hf_sadness    = "h_sadness",
  df_sadness    = "sadness.norm",
  lf_sadness    = "liwc.sad.norm",
  nf_sadness    = "nrc.sadness.norm",

  hf_joy        = "h_joy",
  df_joy        = "joy.norm",
  nf_joy        = "nrc.joy.norm",

  hf_enthusiasm = "h_enthusiasm",
  df_enthusiasm = "enthusiasm.norm",

  hf_pride      = "h_pride",
  df_pride      = "pride.norm",

  hf_hope       = "h_hope",
  df_hope       = "hope.norm"
)

for (dummy in names(dummy_sources)) {
  score <- data2_raw[[dummy_sources[[dummy]]]]
  # Start at 0 and flip to 1 where the score is positive. Rows with a missing
  # score stay 0, and so does the whole column when the source column does not
  # exist (e.g. because the optional LIWC step was skipped).
  data2_raw[[dummy]] <- 0
  data2_raw[[dummy]][score > 0] <- 1
}


# Store the finished data frame under its final name and write it to disk
data2_prepared <- data2_raw

save(list = "data2_prepared", file = "./data2_prepared.Rdata")

#### Main Analysis #############################################################
# Load the test data and the prepared first dataset into the workspace
# NOTE(review): these paths end in ".RData", while the other data files used in
# this script end in ".Rdata" -- confirm the file names on case-sensitive file systems
for (rdata_file in c("./test_data.RData", "./data1_prepared.RData")) {
  load(rdata_file)
}



#### Table 1 ##################################################################
# We take the binary variables produced by the different tools (ed8, word
# embeddings, ELECTRA) as predictions and the human judgement as "true" data,
# then calculate precision, recall and F1 scores per emotion.

emotion_list <- c("anger", "fear", "disgust", "sadness", "joy", "enthusiasm", "pride", "hope")

# Compute precision, recall and F1 for one tool across several emotions.
#
# data     : data frame holding the binary prediction columns
#            (<prefix><emotion>) and the human-coded columns (hf_<emotion>)
# prefix   : column prefix of the tool's predictions, e.g. "df_" or "wb."
# tool     : label written to the `tool` column of the result
# emotions : character vector of emotion names to evaluate
#
# Returns a data frame with one row per emotion and the columns tool, emotion,
# actual, predicted, precision, recall and Fmeasure. Stops with an error if a
# required column is missing. Precision (and therefore Fmeasure) is NaN when
# the tool predicts no positive case for an emotion.
score_tool <- function(data, prefix, tool, emotions) {
  rows <- lapply(emotions, function(emotion) {
    predicted_col <- paste0(prefix, emotion)
    true_col <- paste0("hf_", emotion)
    stopifnot(all(c(predicted_col, true_col) %in% names(data)))

    predicted_flag <- data[[predicted_col]]
    true_flag <- data[[true_col]]

    hits <- sum(predicted_flag & true_flag)  # true positives
    predicted <- sum(predicted_flag)         # cases flagged by the tool
    actual <- sum(true_flag)                 # cases flagged by the human coders

    precision <- hits / predicted
    recall <- hits / actual
    Fmeasure <- 2 * precision * recall / (precision + recall)

    data.frame(
      tool = tool, emotion = emotion, actual = actual, predicted = predicted,
      precision = precision, recall = recall, Fmeasure = Fmeasure
    )
  })
  do.call(rbind, rows)
}

# ed8 Dictionary
df_ed8 <- score_tool(test_data, "df_", "ed8", emotion_list)

# Word Embeddings
df_wb <- score_tool(test_data, "wb.", "Word Embeddings", emotion_list)

# ELECTRA
df_elek <- score_tool(test_data, "el_", "ELECTRA", emotion_list)

table1 <- rbind(df_ed8, df_wb, df_elek)

print(table1)

# Exporting table to txt
stargazer(table1, summary = FALSE, out = "./tables/table1.txt")

#### Figure 2 ##################################################################
# In Figure 2 we count the number of occurrences (as judged by human coders) of
# each emotion in data1_prepared and plot them against the respective F1 scores
# (in this case we use the ELECTRA F1 scores)

occurences <- vapply(
  emotion_list,
  function(emotion) sum(data1_prepared[[paste0("hf_", emotion)]]),
  numeric(1),
  USE.NAMES = FALSE
)

Fscores <- df_elek$Fmeasure[1:8]

fig2_data <- data.frame(occurences, Fscores, emotion_list)

ggplot(data = fig2_data, aes(x = occurences, y = Fscores)) +
  geom_point(color = '#F8766D') +
  geom_smooth(method = "lm", color = "darkgrey", se = FALSE) +
  xlab("Occurences") + ylab("F1 Score") +
  geom_text(aes(label = emotion_list), hjust = 0.4, vjust = -0.6) +
  theme_bw()

# Exporting figure to .pdf
ggsave("./figures/figure2.pdf")

#### Table 2 ##################################################################
# Here the binary variables produced by the off-the-shelf dictionaries
# (NRC & LIWC) are the predictions and the human judgement is the "true" data;
# precision, recall and F1 scores are calculated in the same way as for Table 1.
# Each dictionary is evaluated only on the emotions for which dummy variables
# (lf_* / nf_*) exist.

df_liwc <- score_tool(test_data, "lf_", "LIWC", c("anger", "fear", "sadness"))

df_nrc <- score_tool(test_data, "nf_", "NRC",
                     c("anger", "fear", "disgust", "sadness", "joy"))

table2 <- rbind(df_liwc, df_nrc)

print(table2)

# Exporting table to txt
stargazer(table2, summary = FALSE, out = "./tables/table2.txt")

####################################################################################
### END OF SCRIPT ###
####################################################################################
